#loading the required libraries 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(skimr)
library(DataExplorer)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(leaps)
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(DiagrammeR)
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

“Part 1: Read, Explore, and Clean Data”

#load dataset
dataset <- read.csv("Data.csv")

head(dataset)
##         DATE UNRATE... CONSUMER.CONF.INDEX PPI.CONST.MAT. CPIALLITEMS
## 1 01-05-2022       3.6               106.4        352.857    123.3228
## 2 01-04-2022       3.6               107.3        343.730    121.9782
## 3 01-03-2022       3.6               107.2        345.852    121.3010
## 4 01-02-2022       3.8               110.5        343.583    119.7028
## 5 01-01-2022       4.0               113.8        345.742    118.6193
## 6 01-12-2021       3.9               115.8        335.032    117.6295
##   INFLATION... MORTGAGE.INT..MONTHLY.AVG... MED.HOUSEHOLD.INCOME
## 1     8.581511                       5.2300                   NA
## 2     8.258629                       4.9825                   NA
## 3     8.542456                       4.1720                   NA
## 4     7.871064                       3.7625                   NA
## 5     7.479872                       3.4450                   NA
## 6     7.036403                       3.0980                   NA
##   CORP..BOND.YIELD... MONTHLY.HOME.SUPPLY X..SHARE.OF.WORKING.POPULATION
## 1                4.13                 8.4                             NA
## 2                3.76                 8.4                             NA
## 3                3.43                 7.0                             NA
## 4                3.25                 6.0                             NA
## 5                2.93                 5.7                             NA
## 6                2.65                 5.6                       64.92413
##   GDP.PER.CAPITA QUARTERLY.REAL.GDP QUARTERLY.GDP.GROWTH.RATE.... CSUSHPISA
## 1          74737           19699.47                    -0.1442271   120.724
## 2          74737           19699.47                    -0.1442271   121.813
## 3          73289           19727.92                    -0.3956925   122.888
## 4          73289           19727.92                    -0.3956925   123.831
## 5          73289           19727.92                    -0.3956925   124.780
## 6          72171           19806.29                     1.6807783   125.734
#check dimensions of data set
dim(dataset)
## [1] 241  15
# Get the structure of data set(We could have also used glimpse() here but structure function is easier to read)
str(dataset)
## 'data.frame':    241 obs. of  15 variables:
##  $ DATE                          : chr  "01-05-2022" "01-04-2022" "01-03-2022" "01-02-2022" ...
##  $ UNRATE...                     : num  3.6 3.6 3.6 3.8 4 3.9 4.2 4.6 4.7 5.2 ...
##  $ CONSUMER.CONF.INDEX           : num  106 107 107 110 114 ...
##  $ PPI.CONST.MAT.                : num  353 344 346 344 346 ...
##  $ CPIALLITEMS                   : num  123 122 121 120 119 ...
##  $ INFLATION...                  : num  8.58 8.26 8.54 7.87 7.48 ...
##  $ MORTGAGE.INT..MONTHLY.AVG...  : num  5.23 4.98 4.17 3.76 3.44 ...
##  $ MED.HOUSEHOLD.INCOME          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ CORP..BOND.YIELD...           : num  4.13 3.76 3.43 3.25 2.93 2.65 2.62 2.68 2.53 2.55 ...
##  $ MONTHLY.HOME.SUPPLY           : num  8.4 8.4 7 6 5.7 5.6 6.2 6.9 6.1 6.5 ...
##  $ X..SHARE.OF.WORKING.POPULATION: num  NA NA NA NA NA ...
##  $ GDP.PER.CAPITA                : int  74737 74737 73289 73289 73289 72171 72171 72171 69824 69824 ...
##  $ QUARTERLY.REAL.GDP            : num  19699 19699 19728 19728 19728 ...
##  $ QUARTERLY.GDP.GROWTH.RATE.... : num  -0.144 -0.144 -0.396 -0.396 -0.396 ...
##  $ CSUSHPISA                     : num  121 122 123 124 125 ...
#summary of data 
summary(dataset)
##      DATE             UNRATE...      CONSUMER.CONF.INDEX PPI.CONST.MAT. 
##  Length:241         Min.   : 3.500   Min.   : 25.00      Min.   :143.8  
##  Class :character   1st Qu.: 4.700   1st Qu.: 70.40      1st Qu.:183.3  
##  Mode  :character   Median : 5.600   Median : 94.50      Median :206.2  
##                     Mean   : 6.075   Mean   : 90.81      Mean   :206.9  
##                     3rd Qu.: 7.300   3rd Qu.:108.20      3rd Qu.:221.7  
##                     Max.   :14.700   Max.   :138.40      Max.   :352.9  
##                                                                         
##   CPIALLITEMS      INFLATION...    MORTGAGE.INT..MONTHLY.AVG...
##  Min.   : 75.86   Min.   :-2.097   Min.   :2.684               
##  1st Qu.: 87.72   1st Qu.: 1.464   1st Qu.:3.803               
##  Median : 96.82   Median : 2.071   Median :4.457               
##  Mean   : 95.54   Mean   : 2.296   Mean   :4.698               
##  3rd Qu.:103.26   3rd Qu.: 2.970   3rd Qu.:5.812               
##  Max.   :123.32   Max.   : 8.582   Max.   :6.806               
##                                                                
##  MED.HOUSEHOLD.INCOME CORP..BOND.YIELD... MONTHLY.HOME.SUPPLY
##  Min.   :42409        Min.   :2.140       Min.   : 3.300     
##  1st Qu.:49007        1st Qu.:3.690       1st Qu.: 4.600     
##  Median :50303        Median :4.340       Median : 5.500     
##  Mean   :53274        Mean   :4.471       Mean   : 5.974     
##  3rd Qu.:59039        3rd Qu.:5.410       3rd Qu.: 6.700     
##  Max.   :68703        Max.   :6.750       Max.   :12.200     
##  NA's   :17                                                  
##  X..SHARE.OF.WORKING.POPULATION GDP.PER.CAPITA  QUARTERLY.REAL.GDP
##  Min.   :64.92                  Min.   :37860   Min.   :13477     
##  1st Qu.:65.62                  1st Qu.:46977   1st Qu.:15305     
##  Median :66.74                  Median :51554   Median :16254     
##  Mean   :66.41                  Mean   :52896   Mean   :16536     
##  3rd Qu.:67.13                  3rd Qu.:58745   3rd Qu.:17897     
##  Max.   :67.30                  Max.   :74737   Max.   :19806     
##  NA's   :5                                                        
##  QUARTERLY.GDP.GROWTH.RATE....   CSUSHPISA    
##  Min.   :-8.9373               Min.   :120.7  
##  1st Qu.: 0.2936               1st Qu.:147.4  
##  Median : 0.5800               Median :169.8  
##  Mean   : 0.4901               Mean   :175.3  
##  3rd Qu.: 0.8339               3rd Qu.:189.7  
##  Max.   : 7.5475               Max.   :304.8  
## 
#numerical characteristics from the summary, along with missing values, additional quantile data, and an inline histogram for each variable
skimmed_dataset = skim(dataset)
skimmed_dataset
Data summary
Name dataset
Number of rows 241
Number of columns 15
_______________________
Column type frequency:
character 1
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
DATE 0 1 10 10 0 241 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
UNRATE… 0 1.00 6.07 1.99 3.50 4.70 5.60 7.30 14.70 ▇▃▃▁▁
CONSUMER.CONF.INDEX 0 1.00 90.81 25.87 25.00 70.40 94.50 108.20 138.40 ▁▅▆▇▅
PPI.CONST.MAT. 0 1.00 206.95 40.48 143.80 183.30 206.20 221.70 352.86 ▅▇▂▁▁
CPIALLITEMS 0 1.00 95.54 11.09 75.86 87.72 96.82 103.26 123.32 ▅▆▇▅▁
INFLATION… 0 1.00 2.30 1.64 -2.10 1.46 2.07 2.97 8.58 ▁▇▆▁▁
MORTGAGE.INT..MONTHLY.AVG… 0 1.00 4.70 1.12 2.68 3.80 4.46 5.81 6.81 ▃▇▅▃▅
MED.HOUSEHOLD.INCOME 17 0.93 53273.98 7475.32 42409.00 49007.25 50303.00 59039.00 68703.00 ▅▇▃▃▂
CORP..BOND.YIELD… 0 1.00 4.47 1.08 2.14 3.69 4.34 5.41 6.75 ▃▆▃▇▂
MONTHLY.HOME.SUPPLY 0 1.00 5.97 1.90 3.30 4.60 5.50 6.70 12.20 ▇▇▃▁▁
X..SHARE.OF.WORKING.POPULATION 5 0.98 66.41 0.80 64.92 65.62 66.74 67.13 67.30 ▃▁▃▃▇
GDP.PER.CAPITA 0 1.00 52896.08 8840.59 37860.00 46977.00 51554.00 58745.00 74737.00 ▅▇▅▃▁
QUARTERLY.REAL.GDP 0 1.00 16536.01 1708.44 13477.36 15304.52 16253.73 17896.62 19806.29 ▃▇▅▃▅
QUARTERLY.GDP.GROWTH.RATE…. 0 1.00 0.49 1.45 -8.94 0.29 0.58 0.83 7.55 ▁▁▇▂▁
CSUSHPISA 0 1.00 175.31 36.78 120.72 147.40 169.81 189.71 304.83 ▇▇▃▁▁
#DataExplorer::create_report(dataset)

‘Part 2: manipulate, wrangle, visualise data’

#Time series plot of inflation. 
dataset$DATE <- as.Date(dataset$DATE, format="%d-%m-%Y")
timeplot <- ggplot(dataset, aes(x=DATE, y=INFLATION...)) +
       geom_line()
timeplot

temp <- data.frame(
                 month = as.numeric(format(dataset$DATE, format = "%m")),
                 year = as.numeric(format(dataset$DATE, format = "%Y")))
#Splitting the char variable date into numerical variables month and year and creading a new dataset
#Removing Date variable from the dataset:
processedDataset <- dataset[-c(1)]
processedDataset <- cbind(temp,processedDataset)
#setting seed value for reproducibility 
set.seed(1902)
#splitting data set into test and train set (70:30 split)
subset <- sample(nrow(processedDataset), nrow(processedDataset) * 0.7,replace=FALSE)
train <- processedDataset[subset, ]
nrow(train) / nrow(processedDataset)
## [1] 0.6970954
test <- processedDataset[-subset, ]
nrow(test) / nrow(processedDataset)
## [1] 0.3029046
dim(train)
## [1] 168  16
dim(test)
## [1] 73 16
#filling missing values in the training and test set(imputation using KNN)
train <-kNN(train)
test <-kNN(test)
# additional variables are created after performing knn imputation, Step to remove these extra variables:
train = subset(train, select = c(1:(ncol(train)/2)))
test = subset(test, select = c(1:(ncol(test)/2)))

skim(train)
Data summary
Name train
Number of rows 168
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
month 0 1 6.63 3.55 1.00 4.00 6.50 10.00 12.00 ▇▅▅▅▇
year 0 1 2010.95 5.79 2002.00 2006.00 2010.00 2016.00 2022.00 ▇▆▅▅▃
UNRATE… 0 1 6.07 1.81 3.50 4.70 5.60 7.30 9.90 ▆▇▂▂▃
CONSUMER.CONF.INDEX 0 1 88.70 25.95 25.00 70.05 92.75 106.85 135.70 ▂▅▅▇▅
PPI.CONST.MAT. 0 1 201.87 39.64 143.80 179.80 201.10 215.35 345.85 ▅▇▂▁▁
CPIALLITEMS 0 1 93.80 10.99 75.86 85.11 92.62 101.40 121.30 ▇▇▇▅▂
INFLATION… 0 1 2.36 1.55 -2.10 1.51 2.08 3.16 8.54 ▁▇▆▁▁
MORTGAGE.INT..MONTHLY.AVG… 0 1 4.83 1.14 2.68 3.91 4.61 5.92 6.81 ▃▇▅▅▆
MED.HOUSEHOLD.INCOME 0 1 53063.36 7873.49 42409.00 48201.00 50303.00 59039.00 68703.00 ▅▇▃▃▃
CORP..BOND.YIELD… 0 1 4.63 1.07 2.25 3.80 4.88 5.47 6.75 ▂▆▃▇▂
MONTHLY.HOME.SUPPLY 0 1 5.98 1.95 3.30 4.50 5.50 6.75 11.60 ▇▇▃▂▂
X..SHARE.OF.WORKING.POPULATION 0 1 66.49 0.78 64.92 65.88 66.74 67.13 67.30 ▃▁▂▃▇
GDP.PER.CAPITA 0 1 51593.26 8718.61 37860.00 46089.00 49256.00 57493.00 73289.00 ▅▇▃▃▂
QUARTERLY.REAL.GDP 0 1 16288.91 1704.46 13477.36 15216.65 15792.77 17645.06 19806.29 ▃▇▃▃▃
QUARTERLY.GDP.GROWTH.RATE…. 0 1 0.61 1.00 -2.18 0.32 0.59 0.85 7.55 ▁▇▁▁▁
CSUSHPISA 0 1 180.37 39.58 122.89 148.07 173.74 201.07 304.83 ▆▇▃▁▁
skim(test)
Data summary
Name test
Number of rows 73
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
month 0 1 6.18 3.23 1.00 3.00 6.00 9.00 12.00 ▇▅▆▆▅
year 0 1 2014.00 5.35 2003.00 2010.00 2015.00 2019.00 2022.00 ▃▅▇▇▇
UNRATE… 0 1 6.09 2.36 3.50 4.40 5.40 7.50 14.70 ▇▃▂▁▁
CONSUMER.CONF.INDEX 0 1 95.66 25.19 37.70 79.70 98.00 111.20 138.40 ▂▅▇▇▆
PPI.CONST.MAT. 0 1 218.64 40.23 147.00 195.90 213.30 233.80 352.86 ▃▇▅▁▁
CPIALLITEMS 0 1 99.55 10.30 77.59 92.47 100.35 106.69 123.32 ▂▅▇▆▂
INFLATION… 0 1 2.14 1.85 -1.29 1.17 1.97 2.70 8.58 ▂▇▃▁▁
MORTGAGE.INT..MONTHLY.AVG… 0 1 4.39 1.02 2.77 3.69 4.17 4.95 6.70 ▃▇▅▂▂
MED.HOUSEHOLD.INCOME 0 1 57087.07 7959.40 43318.00 50233.00 56516.00 63179.00 68703.00 ▃▇▆▆▇
CORP..BOND.YIELD… 0 1 4.12 1.03 2.14 3.50 3.95 5.05 6.01 ▃▅▇▃▅
MONTHLY.HOME.SUPPLY 0 1 5.96 1.77 3.40 4.90 5.50 6.70 12.20 ▆▇▂▁▁
X..SHARE.OF.WORKING.POPULATION 0 1 66.13 0.85 64.92 65.35 66.11 66.92 67.30 ▇▁▅▃▇
GDP.PER.CAPITA 0 1 55894.36 8434.15 39752.00 49256.00 56018.00 62786.00 74737.00 ▂▇▇▆▂
QUARTERLY.REAL.GDP 0 1 17104.67 1587.57 13970.16 15769.91 17258.21 18679.60 19806.29 ▃▇▇▃▇
QUARTERLY.GDP.GROWTH.RATE…. 0 1 0.21 2.15 -8.94 0.17 0.57 0.79 7.55 ▁▁▇▂▁
CSUSHPISA 0 1 163.65 26.03 120.72 144.58 158.23 180.91 245.80 ▇▇▆▂▁
#Visualisation of Data:
pairs(processedDataset)

boxplot(processedDataset)

M = cor(rbind(test,train))
corrplot(M, method = 'number',tl.cex=0.5)

#Standardizing/Scaling data 
train_unscaled <- train
train<-scale(train, center = TRUE, scale = TRUE)
train<-as.data.frame(train)
test_unscaled<-test
test<-scale(test, center = TRUE, scale = TRUE)
test<-as.data.frame(test)

‘Part 3: Running Models’

#Using forward selection to identify significant parameters
set.seed(1902)
intercept_only <- lm(INFLATION... ~ 1, data=train)
all <- lm(INFLATION... ~ ., data=train)
forward <- step(intercept_only, direction='forward', scope=formula(all), trace=0)
forward$anova
##                      Step Df   Deviance Resid. Df Resid. Dev          AIC
## 1                         NA         NA       167  167.00000    0.9970119
## 2        + PPI.CONST.MAT. -1 16.5568698       166  150.44313  -14.5436450
## 3                  + year -1 70.3561888       165   80.08694 -118.4629962
## 4    + QUARTERLY.REAL.GDP -1 12.9654830       164   67.12146 -146.1333120
## 5                 + month -1  3.8844731       163   63.23699 -154.1485395
## 6           + CPIALLITEMS -1  6.3584614       162   56.87852 -169.9517526
## 7   + MONTHLY.HOME.SUPPLY -1  7.7652427       161   49.11328 -192.6121941
## 8             + CSUSHPISA -1  2.3440123       160   46.76927 -198.8279237
## 9        + GDP.PER.CAPITA -1  3.1841301       159   43.58514 -208.6736205
## 10  + CONSUMER.CONF.INDEX -1  1.2780463       158   42.30709 -211.6735520
## 11            + UNRATE... -1  0.8468738       157   41.46022 -213.0705728
## 12 + MED.HOUSEHOLD.INCOME -1  0.5351031       156   40.92512 -213.2529659
forward$coefficients
##          (Intercept)       PPI.CONST.MAT.                 year 
##        -1.275481e-14         1.215319e+00        -7.430751e+00 
##   QUARTERLY.REAL.GDP                month          CPIALLITEMS 
##         6.457188e+00        -3.846131e-01         4.599175e+00 
##  MONTHLY.HOME.SUPPLY            CSUSHPISA       GDP.PER.CAPITA 
##        -3.238516e-01         6.816169e-01        -3.551850e+00 
##  CONSUMER.CONF.INDEX            UNRATE... MED.HOUSEHOLD.INCOME 
##        -2.457044e-01         2.579252e-01        -4.553320e-01
# From the above data we see that the following predictors are the most statistically significant


RSQUARE = function(y_actual,y_predict){
  cor(y_actual,y_predict)^2
}
#Linear Regression Model
set.seed(1902)
linearModel = lm(INFLATION... ~  PPI.CONST.MAT. + CPIALLITEMS + MORTGAGE.INT..MONTHLY.AVG... + CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME   + CSUSHPISA + MONTHLY.HOME.SUPPLY   + X..SHARE.OF.WORKING.POPULATION, data = train) 
summary(linearModel)
## 
## Call:
## lm(formula = INFLATION... ~ PPI.CONST.MAT. + CPIALLITEMS + MORTGAGE.INT..MONTHLY.AVG... + 
##     CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME + CSUSHPISA + 
##     MONTHLY.HOME.SUPPLY + X..SHARE.OF.WORKING.POPULATION, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38890 -0.29445  0.02902  0.34231  1.30279 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -3.733e-15  4.834e-02   0.000 1.000000    
## PPI.CONST.MAT.                  1.715e+00  1.756e-01   9.769  < 2e-16 ***
## CPIALLITEMS                     4.643e-01  3.837e-01   1.210 0.228018    
## MORTGAGE.INT..MONTHLY.AVG...    1.203e+00  2.084e-01   5.772 3.99e-08 ***
## CORP..BOND.YIELD...            -8.827e-01  2.404e-01  -3.671 0.000329 ***
## MED.HOUSEHOLD.INCOME           -7.357e-01  3.267e-01  -2.252 0.025673 *  
## CSUSHPISA                       6.262e-01  2.170e-01   2.886 0.004450 ** 
## MONTHLY.HOME.SUPPLY            -2.471e-01  9.888e-02  -2.499 0.013456 *  
## X..SHARE.OF.WORKING.POPULATION  6.712e-01  3.127e-01   2.147 0.033346 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6265 on 159 degrees of freedom
## Multiple R-squared:  0.6263, Adjusted R-squared:  0.6075 
## F-statistic:  33.3 on 8 and 159 DF,  p-value: < 2.2e-16
linearPred <- predict(linearModel, test)
lm_rmse <- rmse(test$INFLATION..., linearPred) #0.5718129
AIC(linearModel)
## [1] 330.4117
BIC(linearModel)
## [1] 361.6513
#Ridge Regression Model

set.seed(1902)
#define response variable
y <- train$INFLATION...
#define matrix of predictor variables
x <- data.matrix(train[, c('PPI.CONST.MAT.','CPIALLITEMS', 'MORTGAGE.INT..MONTHLY.AVG...', 'CORP..BOND.YIELD...', 'MED.HOUSEHOLD.INCOME', 'CSUSHPISA', 'MONTHLY.HOME.SUPPLY', 'X..SHARE.OF.WORKING.POPULATION')])
xtest <- data.matrix(test[, c('PPI.CONST.MAT.','CPIALLITEMS', 'MORTGAGE.INT..MONTHLY.AVG...', 'CORP..BOND.YIELD...', 'MED.HOUSEHOLD.INCOME', 'CSUSHPISA', 'MONTHLY.HOME.SUPPLY', 'X..SHARE.OF.WORKING.POPULATION')])
ytest <- test$INFLATION...
#perform k-fold cross-validation to find optimal lambda value
crossModelRidge <- cv.glmnet(x, y, alpha = 0)
lambdas <- 10^seq(2, -3, by = -.1)
ridgeRegression <- glmnet(x, y, alpha = 0,family = 'gaussian', lambda = lambdas)
summary(ridgeRegression)
##           Length Class     Mode   
## a0         51    -none-    numeric
## beta      408    dgCMatrix S4     
## df         51    -none-    numeric
## dim         2    -none-    numeric
## lambda     51    -none-    numeric
## dev.ratio  51    -none-    numeric
## nulldev     1    -none-    numeric
## npasses     1    -none-    numeric
## jerr        1    -none-    numeric
## offset      1    -none-    logical
## call        6    -none-    call   
## nobs        1    -none-    numeric
#find optimal lambda value that minimizes test MSE
bestLambda <- crossModelRidge$lambda.min
bestLambda
## [1] 0.03139312
#produce plot of test MSE by lambda value
plot(crossModelRidge) 

#find coefficients of best model
bestModel <- glmnet(x, y, alpha = 0, lambda = bestLambda)
coef(bestModel)
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                                           s0
## (Intercept)                    -2.013068e-15
## PPI.CONST.MAT.                  1.350516e+00
## CPIALLITEMS                     6.493612e-02
## MORTGAGE.INT..MONTHLY.AVG...    8.674415e-01
## CORP..BOND.YIELD...            -3.103988e-01
## MED.HOUSEHOLD.INCOME           -3.238406e-01
## CSUSHPISA                       2.282555e-01
## MONTHLY.HOME.SUPPLY            -2.236387e-01
## X..SHARE.OF.WORKING.POPULATION  3.424238e-01
plot(ridgeRegression, xvar = "lambda")

#use fitted best model to make predictions
yPredicted <- predict(ridgeRegression, s = bestLambda, newx = xtest)
ridge_rmse <-rmse(ytest,yPredicted) #0.4914431
#Lasso Regression Model
set.seed(1902)
#perform k-fold cross-validation to find optimal lambda value
crossModel2 <- cv.glmnet(x, y, alpha = 1)

#find optimal lambda value that minimizes test MSE
bestLambda2 <- crossModel2$lambda.min
bestLambda2
## [1] 0.0002430655
#produce plot of test MSE by lambda value
plot(crossModel2) 

#find coefficients of best model
bestModel2 <- glmnet(x, y, alpha = 1, lambda = bestLambda2)
coef(bestModel2)
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                                           s0
## (Intercept)                    -3.973851e-15
## PPI.CONST.MAT.                  1.712321e+00
## CPIALLITEMS                     4.500882e-01
## MORTGAGE.INT..MONTHLY.AVG...    1.199470e+00
## CORP..BOND.YIELD...            -8.769051e-01
## MED.HOUSEHOLD.INCOME           -7.299107e-01
## CSUSHPISA                       6.174392e-01
## MONTHLY.HOME.SUPPLY            -2.455779e-01
## X..SHARE.OF.WORKING.POPULATION  6.626533e-01
#use fitted best model to make predictions
yPredicted2 <- predict(bestModel2, s = bestLambda2, newx = xtest)
lasso_rmse <-rmse(ytest,yPredicted2) #0.5697695
#Random Forest Model
set.seed(1902)
 randomForestModel <- randomForest(INFLATION... ~  PPI.CONST.MAT. + CPIALLITEMS + MORTGAGE.INT..MONTHLY.AVG... + CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME + CSUSHPISA + MONTHLY.HOME.SUPPLY   + X..SHARE.OF.WORKING.POPULATION, data = train, mtry = 3, importance = TRUE, na.action = na.omit)
randomForestModel
## 
## Call:
##  randomForest(formula = INFLATION... ~ PPI.CONST.MAT. + CPIALLITEMS +      MORTGAGE.INT..MONTHLY.AVG... + CORP..BOND.YIELD... + MED.HOUSEHOLD.INCOME +      CSUSHPISA + MONTHLY.HOME.SUPPLY + X..SHARE.OF.WORKING.POPULATION,      data = train, mtry = 3, importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.1414103
##                     % Var explained: 85.77
plot(randomForestModel)

summary(randomForestModel)
##                 Length Class  Mode     
## call              6    -none- call     
## type              1    -none- character
## predicted       168    -none- numeric  
## mse             500    -none- numeric  
## rsq             500    -none- numeric  
## oob.times       168    -none- numeric  
## importance       16    -none- numeric  
## importanceSD      8    -none- numeric  
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y               168    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call
rfPredictions = predict(randomForestModel, test)
mean((rfPredictions - test$INFLATION...)^2)
## [1] 0.4569684
rf_rmse<-rmse(test$INFLATION..., rfPredictions) #0.6759944
#XGBOOST MODEL
set.seed(1902)
train_x = data.matrix(train[, -7])
train_y = train[,7]
test_x = data.matrix(test[, -7])
test_y = test[, 7]
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgb_test = xgb.DMatrix(data = test_x, label = test_y)
watchlist = list(train=xgb_train, test=xgb_test)
model = xgb.train(data = xgb_train, max.depth = 3, watchlist=watchlist, nrounds = 70) #From the output we can see that the minimum testing RMSE is achieved at 26 rounds.
## [1]  train-rmse:0.897246 test-rmse:0.866098 
## [2]  train-rmse:0.739680 test-rmse:0.767033 
## [3]  train-rmse:0.640073 test-rmse:0.680113 
## [4]  train-rmse:0.571949 test-rmse:0.623681 
## [5]  train-rmse:0.523814 test-rmse:0.590589 
## [6]  train-rmse:0.475864 test-rmse:0.624883 
## [7]  train-rmse:0.446896 test-rmse:0.617974 
## [8]  train-rmse:0.395943 test-rmse:0.566077 
## [9]  train-rmse:0.360114 test-rmse:0.552384 
## [10] train-rmse:0.331065 test-rmse:0.551328 
## [11] train-rmse:0.308852 test-rmse:0.560815 
## [12] train-rmse:0.286277 test-rmse:0.575717 
## [13] train-rmse:0.272002 test-rmse:0.570884 
## [14] train-rmse:0.263641 test-rmse:0.570806 
## [15] train-rmse:0.255383 test-rmse:0.570363 
## [16] train-rmse:0.245911 test-rmse:0.568605 
## [17] train-rmse:0.237166 test-rmse:0.567948 
## [18] train-rmse:0.225343 test-rmse:0.566148 
## [19] train-rmse:0.212451 test-rmse:0.568949 
## [20] train-rmse:0.198938 test-rmse:0.567187 
## [21] train-rmse:0.194421 test-rmse:0.564668 
## [22] train-rmse:0.182341 test-rmse:0.559164 
## [23] train-rmse:0.169826 test-rmse:0.554380 
## [24] train-rmse:0.164237 test-rmse:0.558524 
## [25] train-rmse:0.155566 test-rmse:0.556023 
## [26] train-rmse:0.148065 test-rmse:0.557349 
## [27] train-rmse:0.146544 test-rmse:0.557649 
## [28] train-rmse:0.139689 test-rmse:0.560763 
## [29] train-rmse:0.134906 test-rmse:0.563738 
## [30] train-rmse:0.131989 test-rmse:0.565147 
## [31] train-rmse:0.124874 test-rmse:0.562844 
## [32] train-rmse:0.122828 test-rmse:0.563370 
## [33] train-rmse:0.121044 test-rmse:0.564114 
## [34] train-rmse:0.118629 test-rmse:0.564537 
## [35] train-rmse:0.114708 test-rmse:0.563925 
## [36] train-rmse:0.112203 test-rmse:0.565329 
## [37] train-rmse:0.107976 test-rmse:0.565435 
## [38] train-rmse:0.104595 test-rmse:0.564920 
## [39] train-rmse:0.100645 test-rmse:0.565461 
## [40] train-rmse:0.098298 test-rmse:0.567951 
## [41] train-rmse:0.094117 test-rmse:0.566983 
## [42] train-rmse:0.092052 test-rmse:0.566956 
## [43] train-rmse:0.088535 test-rmse:0.566991 
## [44] train-rmse:0.085178 test-rmse:0.566165 
## [45] train-rmse:0.082362 test-rmse:0.566716 
## [46] train-rmse:0.079926 test-rmse:0.566752 
## [47] train-rmse:0.078685 test-rmse:0.567498 
## [48] train-rmse:0.074283 test-rmse:0.566310 
## [49] train-rmse:0.071731 test-rmse:0.566383 
## [50] train-rmse:0.070841 test-rmse:0.566452 
## [51] train-rmse:0.070058 test-rmse:0.566555 
## [52] train-rmse:0.068420 test-rmse:0.566996 
## [53] train-rmse:0.067011 test-rmse:0.565867 
## [54] train-rmse:0.066064 test-rmse:0.566169 
## [55] train-rmse:0.063994 test-rmse:0.564853 
## [56] train-rmse:0.062709 test-rmse:0.565801 
## [57] train-rmse:0.061567 test-rmse:0.565276 
## [58] train-rmse:0.059644 test-rmse:0.567288 
## [59] train-rmse:0.058105 test-rmse:0.568088 
## [60] train-rmse:0.056176 test-rmse:0.568228 
## [61] train-rmse:0.055627 test-rmse:0.568421 
## [62] train-rmse:0.054124 test-rmse:0.568500 
## [63] train-rmse:0.053086 test-rmse:0.568598 
## [64] train-rmse:0.051967 test-rmse:0.569344 
## [65] train-rmse:0.049996 test-rmse:0.567794 
## [66] train-rmse:0.049632 test-rmse:0.568462 
## [67] train-rmse:0.049237 test-rmse:0.567722 
## [68] train-rmse:0.048476 test-rmse:0.567442 
## [69] train-rmse:0.047141 test-rmse:0.568092 
## [70] train-rmse:0.045849 test-rmse:0.566548
final = xgboost(data = xgb_train, max.depth = 3, nrounds = 26, verbose = 0) #we’ll define our final XGBoost model to use 26 rounds
xgb.plot.tree(model = final, trees = 1:3)
pred_y = predict(final, test_x)
mean((test_y - pred_y)^2) #mse
## [1] 0.3106376
MAE(test_y, pred_y) #mae
## [1] 0.4356466
xg_rmse<-rmse(test_y, pred_y) #rmse 0.5573487
#RMSE Comparison for the models.
x= c('lm','ridge','lasso','rf','xgboost')
y= c(lm_rmse,ridge_rmse,lasso_rmse,rf_rmse,xg_rmse)
sample_data <- data.frame(name = c('lm','ridge','lasso','rf','xgboost') ,
                          value = c(lm_rmse,ridge_rmse,lasso_rmse,rf_rmse,xg_rmse))
plot<-ggplot(sample_data,
             aes(name,value)) +
geom_bar(stat = "identity") + xlab("ML Model")+ylab("RMSE Value")
plot 

#The plot shows that Ridge regression works best for the following dataset to predict inflation.
#Comparison of actual test values vs values predicted using Ridge Regression Model.
df1 = data.frame(Value_Type =rep (c("Actual"),length(test$INFLATION...)),inflation = test$INFLATION...,year=test_unscaled$year)
df2 = data.frame(Value_Type =rep (c("Predicted"),length(yPredicted)),inflation = yPredicted,year=test_unscaled$year)
df2<- rename(df2,inflation=s1)
final_df = union_all(df1,df2)
plt <- ggplot(data=final_df, aes(x=year, y=inflation, color=Value_Type))+
       geom_line()+
       ggtitle("Inflation Prediction")
plt